UHF_zipcode = 
  read_csv("./data/appb.csv") %>% 
  slice(-43) %>% 
  select(-Borough) %>% 
  rename("UHF" = "UHF Neighborhood") %>% 
  janitor::clean_names()
raw_hiv = 
  GET("https://data.cityofnewyork.us/api/views/fju2-rdad/rows.csv") %>% 
  content("parsed") %>% 
  janitor::clean_names()

combine_hiv = 
  right_join(UHF_zipcode, raw_hiv, by = "uhf") %>%
  janitor::clean_names() %>% 
  separate(zip_code, into = c("zipcode1", "zipcode2", "zipcode3", 
                              "zipcode4", "zipcode5", "zipcode6", "zipcode7", "zipcode8",
                              "zipcode9"), sep = ", ") %>% 
  gather(key = zip_code, value = zipcode_value, zipcode1:zipcode9) %>% 
  filter(!is.na(zipcode_value)) %>% 
  rename("zipcode" = "zipcode_value") %>% 
  select(zipcode, everything(), -zip_code)
r = GET('http://data.beta.nyc//dataset/3bf5fb73-edb5-4b05-bb29-7c95f4a727fc/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90/download/f4129d9aa6dd4281bc98d0f701629b76nyczipcodetabulationareas.geojson')
nyc_zipcode = readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)

zipcode_lat_lng = nyc_zipcode@data %>% 
  select(zipcode = postalCode, longitude, latitude) %>% 
  mutate(zipcode = as.character(zipcode))
  
combine_hiv1 = 
  full_join(zipcode_lat_lng, combine_hiv, by = "zipcode") %>% 
  mutate(longitude = as.numeric(longitude), latitude = as.numeric(latitude)) %>% 
  group_by(uhf) %>% 
  summarise(lng = mean(longitude),
            lat = mean(latitude)) %>% 
  filter(!(uhf == "Pelhem - Throgs Neck"))
pums_raw = read_csv("./data/selected_pums.csv")

temp = tempfile(fileext = ".xls")

dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/nyc_zcta10_to_puma10.xls"

download.file(dataURL, destfile = temp, mode = 'wb')

zcta_to_puma = readxl::read_xls(temp, sheet = 2) %>% 
  select(zcta = zcta10, puma = puma10) %>% 
  mutate(puma = as.numeric(puma))

dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/zip_to_zcta10_nyc_revised.xls"

download.file(dataURL, destfile = temp, mode = 'wb')
zip_to_zcta = readxl::read_xls(temp, sheet = 2) %>% 
  select(zipcode, zcta = zcta5) 
pums_data = 
  pums_raw %>% 
  select(puma = PUMA10, income = PINCP, year = ADJINC) %>% 
  filter(puma != -9)  # remove data from 2011 due to lack of area information

pums_data$year = recode(pums_data$year, 
                        "1042852" = "2012",
                        "1025215" = "2013",  
                        "1009585" = "2014", 
                        "1001264" = "2015")   

pums_data = 
  pums_data %>% 
  group_by(year, puma) %>% 
  summarise(mid_income = median(income, na.rm = TRUE)) %>% 
  ungroup()         # calculate yearly median income for each area
puma_to_zipcode = right_join(zip_to_zcta, zcta_to_puma, by = "zcta") %>%   # generaate a puma to zipcode file
  select(puma, zipcode)

income_zipcode = right_join(pums_data, puma_to_zipcode, by = "puma") %>%  # matching zipcode with median income data
  select(year, zipcode, mid_income) %>% 
  mutate(year = as.numeric(year))

combine_hiv_income = 
  left_join(combine_hiv, income_zipcode, by = c("year", "zipcode"))

income_hiv = combine_hiv_income

HIV diagnoses regression

Income vs HIV

income_hiv %>% 
  filter(year != "2011" & age != "All") %>%
  lm(hiv_diagnoses ~ borough + gender + age + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 0.983 0.302 3.252 0.001
boroughBrooklyn 0.297 0.281 1.060 0.289
boroughManhattan 3.091 0.331 9.332 0.000
boroughQueens -1.245 0.259 -4.811 0.000
boroughStaten Island -4.376 0.397 -11.016 0.000
genderMale 6.083 0.152 40.138 0.000
age20 - 29 9.600 0.262 36.576 0.000
age30 - 39 6.870 0.262 26.175 0.000
age40 - 49 4.627 0.262 17.627 0.000
age50 - 59 2.355 0.262 8.972 0.000
age60+ 0.427 0.262 1.626 0.104
mid_income 0.000 0.000 -17.851 0.000
income_hiv %>% 
  filter(year != "2011" & race != "All") %>%
  lm(hiv_diagnoses ~ borough + gender + race + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 1.514 0.514 2.945 0.003
boroughBrooklyn 0.357 0.493 0.724 0.469
boroughManhattan 3.710 0.582 6.376 0.000
boroughQueens -1.494 0.454 -3.287 0.001
boroughStaten Island -5.251 0.698 -7.527 0.000
genderMale 7.299 0.266 27.425 0.000
raceBlack 10.932 0.421 25.978 0.000
raceLatino/Hispanic 9.027 0.421 21.451 0.000
raceOther/Unknown -1.380 0.421 -3.278 0.001
raceWhite 3.628 0.421 8.621 0.000
mid_income 0.000 0.000 -12.197 0.000
income_plot = income_hiv %>% 
  filter(year != "2011") %>% 
  group_by(uhf, year) %>% 
  summarise(sum_hiv = mean(hiv_diagnoses), mid_in = median(mid_income)) %>% 
  ggplot(aes(x = mid_in, y = sum_hiv, color = year)) +
  geom_point() + 
  geom_smooth(method = lm) +
  theme_bw() +
  theme(legend.position = "None") +
  labs(
        title = "Income Influence on HIV Incidence",
        x = "Average income of each neighborhood",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      )
ggplotly(income_plot)

Interpretation:

According to the scatterplot of income vs HIV diagnoses, it is obvious that the points are mostly concentrated in low income neigbourhood and the number of HIV diagnoses in high average income area( > 60000/year) centered in less than 1000 cases/neigbourhood. This result is exactly what we expected, because low-income community are tend to have insufficient healthcare supply, less insurance coverage, poor education level and inadequate epidemiology awareness, which could jointly cause the relatively high HIV incidence. So, we intend to advocate the related authorities to spare more public health resource in low-to-median-income neigbourhood to raise public awareness of HIV prevention knowledge and increase budgets of medical facilities in those areas.

Limitations:

We can not visualize the effect of age and race at the same time.

HIV diagnosis rate

income_hiv %>% 
  filter(year != "2011" & age != "All") %>%
  lm(hiv_diagnosis_rate ~ borough + gender + age + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 17.851 1.511 11.811 0.000
boroughBrooklyn -12.078 1.403 -8.611 0.000
boroughManhattan 15.424 1.656 9.316 0.000
boroughQueens -22.620 1.293 -17.492 0.000
boroughStaten Island -30.524 1.985 -15.377 0.000
genderMale 39.822 0.757 52.582 0.000
age20 - 29 44.060 1.312 33.589 0.000
age30 - 39 33.215 1.312 25.322 0.000
age40 - 49 27.986 1.312 21.336 0.000
age50 - 59 13.841 1.312 10.552 0.000
age60+ -4.261 1.312 -3.249 0.001
mid_income -0.001 0.000 -18.326 0.000
income_hiv %>% 
  filter(year != "2011" & race != "All") %>%
  lm(hiv_diagnosis_rate ~ borough + gender + race + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 0.795 2.459 0.323 0.747
boroughBrooklyn -11.951 2.357 -5.070 0.000
boroughManhattan 22.135 2.783 7.955 0.000
boroughQueens -25.218 2.173 -11.603 0.000
boroughStaten Island -31.251 3.336 -9.367 0.000
genderMale 49.480 1.273 38.875 0.000
raceBlack 61.810 2.012 30.713 0.000
raceLatino/Hispanic 34.404 2.012 17.095 0.000
raceOther/Unknown 9.809 2.012 4.874 0.000
raceWhite 12.984 2.012 6.452 0.000
mid_income 0.000 0.000 -1.729 0.084
income_plot_diag_rate = income_hiv %>% 
  filter(year != "2011") %>% 
  group_by(uhf, year) %>% 
  summarise(sum_hiv_diagnosis_rate = sum(hiv_diagnosis_rate), mid_in = median(mid_income)) %>% 
  ggplot(aes(x = mid_in, y = sum_hiv_diagnosis_rate, color = year)) +
  geom_point() + 
  geom_smooth(method = lm) +
  theme_bw() +
  theme(legend.position = "None")

ggplotly(income_plot_diag_rate)